library(tidyverse)
library(dplyr)
library(ggplot2)
library(rgdal)
library(tmap)
library(readxl)
library(ggrepel)
library(ggthemes)
library(scales)
library(sf)
library(raster)
library(rgeos)
library(ggmap)
library(DT)
library(devtools)
ggmap::register_google(key = "AIzaSyB1-MjiXEIrgdT3FbflMLc8EUaQXVG3XVY")
devtools::install_github("rstudio/leaflet")

Parking Violations in NYC

fines <- read_excel("ParkingViolationCodes_January2020.xlsx")
1. Data exploration
head(df)
colnames(df)
 [1] "Summons Number"                    "Plate ID"                          "Registration State"               
 [4] "Plate Type"                        "Issue Date"                        "Violation Code"                   
 [7] "Vehicle Body Type"                 "Vehicle Make"                      "Issuing Agency"                   
[10] "Street Code1"                      "Street Code2"                      "Street Code3"                     
[13] "Vehicle Expiration Date"           "Violation Location"                "Violation Precinct"               
[16] "Issuer Precinct"                   "Issuer Code"                       "Issuer Command"                   
[19] "Issuer Squad"                      "Violation Time"                    "Time First Observed"              
[22] "Violation County"                  "Violation In Front Of Or Opposite" "House Number"                     
[25] "Street Name"                       "Intersecting Street"               "Date First Observed"              
[28] "Law Section"                       "Sub Division"                      "Violation Legal Code"             
[31] "Days Parking In Effect"            "From Hours In Effect"              "To Hours In Effect"               
[34] "Vehicle Color"                     "Unregistered Vehicle?"             "Vehicle Year"                     
[37] "Meter Number"                      "Feet From Curb"                    "Violation Post Code"              
[40] "Violation Description"             "No Standing or Stopping Violation" "Hydrant Violation"                
[43] "Double Parking Violation"          "issue_date"                        "year"                             
[46] "month"                             "day"                              
df <- dplyr::select(df, -"Violation Description")
colnames(fines) <- str_to_title(colnames(fines))
fines <- fines[c(1:3)]
colnames(fines)[3] <- "Fine Amount $"
head(fines)

Violation 38 [FAIL TO DSPLY MUNI METER RECPT] and 69 [FAIL TO DISP. MUNI METER RECPT] look like the same infraction. The fine is $65 for both. I’d rename one as the other and aggregate them as the same violation.

fines$`Violation Description`[69] <- "FAIL TO DSPLY MUNI METER RECPT"
df <- merge(x = df, y = fines, by = "Violation Code", all.x = TRUE)
a) Violation Code and Fine Amounts

Add the violation code descriptions and fine amounts to the data file. Provide a visual overview of the top 10 most common types of violations (feel free to group them into categories if reasonable). Compare how this ranking differs if we focus on the total amount of revenue generated.

Top 10 Violations

one_a <- df %>%
  dplyr::select(`Violation Description`) %>%
  group_by(`Violation Description`) %>%
  count() %>%
  ungroup() %>%
  arrange(desc(n)) %>%
  head(10)
legend_ord <- levels(with(one_a, reorder(`Violation Description`, desc(n))))
one_a %>%
  ggplot(aes(x = reorder(`Violation Description`, desc(n)), y = n, fill = `Violation Description`)) +
  geom_col(alpha = 0.8) +
  scale_fill_brewer(palette = "RdBu", breaks = legend_ord) +
  
  labs(
  title = "Top 10 most common types of violations in NYC",
  x = "",
  y = "Number of Violations",
  x.axis = "",
  caption = "Source: data.cityofnewyork.us"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
   
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_y_continuous(labels = comma)

Top 10 Violations for revenue

one_a_2 <- df %>%
  dplyr::select(`Violation Description`, `Fine Amount $`) %>%
  group_by(`Violation Description`,`Fine Amount $`) %>% 
  summarize(`Total Fines Amount $` = sum(`Fine Amount $`)) %>%
  arrange(desc(`Total Fines Amount $`)) %>%
  head(10)
`summarise()` has grouped output by 'Violation Description'. You can override using the `.groups` argument.
legend_ord2 <- levels(with(one_a_2, reorder(`Violation Description`, desc(`Total Fines Amount $`))))
one_a_2 %>%
  ggplot(aes(x = reorder(`Violation Description`, desc(`Total Fines Amount $`)), y = `Total Fines Amount $`, fill = `Violation Description`)) +
  geom_col(alpha = 0.8) +
  scale_fill_brewer(palette = "RdBu", breaks = legend_ord2) +
  
  labs(
  title = "Top 10 highest violations in NYC per revenue",
  x = "",
  y = "Dollar ($)",
  x.axis = "",
  caption = "Source: data.cityofnewyork.us"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
   
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_y_continuous(labels = comma)

b) Average amount of fine by vehicle

Compare the average amount of fine by vehicle color, vehicle year, and vehicle plate type [Hint: it is sufficient to restrict your attention to commercial (COM) and passenger (PAS) vehicles]? Briefly describe your findings.

Vehicle Color

one_b_1 <- df %>%
  dplyr::select(`Vehicle Color`, `Fine Amount $`) %>%
  drop_na(`Fine Amount $`) %>%
  group_by(`Vehicle Color`) %>%
  add_count(`Vehicle Color`) %>%
  filter(n > 10) %>%
  summarize(`Average Fine` = mean(`Fine Amount $`)) %>%
  arrange(desc(`Average Fine`)) %>%
  head(10)
legend_ord3 <- levels(with(one_b_1, reorder(`Vehicle Color`, desc(`Average Fine`))))
one_b_1 %>%
  ggplot(aes(x = reorder(`Vehicle Color`, desc(`Average Fine`)), y = `Average Fine`, fill = `Vehicle Color`)) +
  geom_col(alpha = 0.8) +
  scale_fill_brewer(palette = "RdBu", breaks = legend_ord3) +
  
  labs(
  title = "Top 10 highest Average Fine per Car Color",
  x = "",
  y = "Dollar ($)",
  x.axis = "",
  caption = "Source: data.cityofnewyork.us"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
   
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_y_continuous(labels = comma)

Vehicle Year

one_b_2 <- df %>%
  dplyr::select(`Vehicle Year`, `Fine Amount $`) %>%
  group_by(`Vehicle Year`) %>%
  summarize(`Average Fine` = mean(`Fine Amount $`)) %>%
  filter(`Vehicle Year` > 0 & `Vehicle Year` <= 2021) %>%
  arrange(desc(`Average Fine`)) %>%
  head(10)
legend_ord4 <- levels(with(one_b_2, reorder(`Vehicle Year`, desc(`Average Fine`))))
one_b_2 %>%
  ggplot(aes(x = reorder(`Vehicle Year`, desc(`Average Fine`)), y = `Average Fine`, fill = as.factor(`Vehicle Year`))) +
  geom_col(alpha = 0.8) +
  scale_fill_brewer(palette = "RdBu", breaks = legend_ord3) +
  
  labs(
  title = "Top 10 highest Average Fine per Vehicle Registration Year",
  x = "Registration Year",
  y = "Dollar ($)",
  x.axis = "",
  caption = "Source: data.cityofnewyork.us"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_y_continuous(labels = comma)

Vehicle Plate Type

one_b_3 <- df %>%
  dplyr::select(`Plate Type`, `Fine Amount $`) %>%
  group_by(`Plate Type`) %>%
  filter(`Plate Type` == "COM" | `Plate Type` == "PAS") %>%
  summarize(`Average Fine` = mean(`Fine Amount $`, na.rm=TRUE)) %>%
  arrange(desc(`Average Fine`))
one_b_3 %>%
  ggplot(aes(x = reorder(`Plate Type`, desc(`Average Fine`)), y = `Average Fine`, fill = `Plate Type`)) +
  geom_col(alpha = 0.8) +
  scale_fill_brewer(palette = "RdYlBu") +
  
  geom_hline(yintercept=89.704) +
  
  labs(
  title = "Average Fine per Car Type",
  x = "",
  y = "Dollar ($)",
  x.axis = "",
  caption = "Source: data.cityofnewyork.us"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
   
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_y_continuous(labels = comma, limits = c(0, 100))

c) Effect of COVID

Let’s see if we can observe the effect of COVID restrictions on parking violations. Present a visualization that shows how parking violations changed after the New York statewide stay-at-home order on March 14, 2020. Make sure the visualization clearly highlights the main pattern (the COVID effect).

one_c <- df %>%
  filter(`year` == 2020) %>%
  dplyr::select(`month`, `Violation Description`) %>%
  add_count(`Violation Description`) %>%
  group_by(`month`) %>%
  summarize(Mean = mean(n))
one_c %>%

  ggplot(aes(x = `month`, y = Mean)) +
  
  geom_line() +
  geom_vline(xintercept=3.5) +
  xlim(c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) +
  ylim(50000,200000) +
  
  geom_label(
  label="New York statewide\nstay-at-home order", 
  x=2.2,
  y=145000,
  label.padding = unit(0.55, "lines"),
  label.size = 0.15,
  color = "black",
  fill="lightcyan"
  ) +
  
  labs(
    title = "NY Violations in 2020",
    x = "2020 Months", 
    y = "Average Violations per Month",
    caption = "Source: data.cityofnewyork.us"
    ) + 
  
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect()
    ) 

I tried my best (!), but I can’t really say there has been a “COVID effect”. Violations appear to be lower in the first months of 2020 compared to the summer. Probably snow and hard weather have an harder impact on traffic than Covid restrictions.


2. Map by Precincts

Read in the shape files for the police precincts and remove all precincts outside of Manhattan.

setwd("/Users/davidev7/Documents/Columbia/Second Semester/Data Visualization/course_content-main/Exercises/07_parking-graded/data")
file.exists('/Users/davidev7/Documents/Columbia/Second Semester/Data Visualization/course_content-main/Exercises/07_parking-graded/data/nypp_21a')

nypp <- readOGR(dsn = '/Users/davidev7/Documents/Columbia/Second Semester/Data Visualization/course_content-main/Exercises/07_parking-graded/data/nypp_21a', layer = "nypp")
str(nypp,max.level = 2)
Formal class 'SpatialPolygonsDataFrame' [package "sp"] with 5 slots
  ..@ data       :'data.frame': 77 obs. of  3 variables:
  ..@ polygons   :List of 77
  ..@ plotOrder  : int [1:77] 75 77 76 71 63 67 28 69 74 38 ...
  ..@ bbox       : num [1:2, 1:2] 913175 120122 1067383 272844
  .. ..- attr(*, "dimnames")=List of 2
  ..@ proj4string:Formal class 'CRS' [package "sp"] with 1 slot
tm_shape(nypp) +
  tm_fill() +
  tm_borders() 

I know from NYC website that Manhattan precincts are 1 to 34.

mnh<- subset(nypp, nypp$Precinct<=34)

tm_shape(mnh) +
  tm_fill() +
  tm_borders() 

a) Number of tickets, total fines, and average fines

Provide three maps that show choropleth maps of:

  • the total number of tickets
  • the total amount of fines
  • the average amount of fines

Briefly describe what you learn from these maps in comparison.

mnh_df <- df %>%
  filter(`Violation Precinct` > 0 & `Violation Precinct` <= 34 )

Now I need to merge the map with the violation’s dataset. However, since the resulting object would be too big, I’ll make little datasets related to the question and merge those one by one.

mnh
class       : SpatialPolygonsDataFrame 
features    : 22 
extent      : 971013.5, 1009023, 188082.3, 259027.5  (xmin, xmax, ymin, ymax)
crs         : +proj=lcc +lat_0=40.1666666666667 +lon_0=-74 +lat_1=41.0333333333333 +lat_2=40.6666666666667 +x_0=300000 +y_0=0 +datum=NAD83 +units=us-ft +no_defs 
variables   : 3
names       : Precinct,    Shape_Leng,    Shape_Area 
min values  :        1, 17098.9870878,  15289544.596 
max values  :       34, 80969.4264974, 52113460.9163 
mnh@data[["Precinct"]]
 [1]  1  5  6  7  9 10 13 14 17 18 19 20 22 23 24 25 26 28 30 32 33 34
sort((unique(mnh_df$`Violation Precinct`)))
 [1]  1  2  3  4  5  6  7  8  9 10 12 13 14 15 17 18 19 20 21 22 23 24 25 26 27 28 30 32 33 34
  • the total number of tickets
tickets <- mnh_df %>%
  group_by(`Violation Precinct`) %>%
  count() %>%
  rename(`Total Number of Tickets` = "n")
tickets_map <- merge(mnh, tickets, by.x = "Precinct", by.y = "Violation Precinct", all.x = TRUE, duplicateGeoms = TRUE)
tm_shape(tickets_map) +
  tm_fill(col = "Total Number of Tickets") +
  tm_borders() 

  • the total amount of fines
total_fines <- mnh_df %>%
  dplyr::select(`Violation Precinct`, `Fine Amount $`) %>%
  group_by(`Violation Precinct`) %>% 
  summarize(`Total Fines Amount $` = sum(`Fine Amount $`, na.rm = TRUE))
fines_map <- tickets_map <- merge(mnh, total_fines, by.x = "Precinct", by.y = "Violation Precinct", all.x = TRUE, duplicateGeoms = TRUE)
tm_shape(fines_map) +
  tm_fill(col = "Total Fines Amount $") +
  tm_borders() 

  • the average amount of fines
average_fines <- mnh_df %>%
  dplyr::select(`Violation Precinct`, `Fine Amount $`) %>%
  group_by(`Violation Precinct`) %>% 
  summarize(`Mean Fines Amount $` = mean(`Fine Amount $`, na.rm = TRUE))
mnh_avg_fines <- merge(mnh, average_fines, by.x = "Precinct", by.y = "Violation Precinct", all.x = TRUE, duplicateGeoms = TRUE)
tm_shape(mnh_avg_fines) +
  tm_fill(col = "Mean Fines Amount $") +
  tm_borders() 

b) Types of violations

Group the almost 100 types of ticket violations into a smaller set of 4-6 subgroups (where other should be the remainder of violations not included in other groups you defined). [Hint: No need to spend more than 5 minutes thinking about what the right grouping is.]. Provide choropleth maps for each of these subgroups to show where different types of violations are more or less common.

parking_list = c(9:11, 13:28, 30:33, 37:39, 44, 46, 47, 55, 59, 60, 62:65, 77)
sticker_list = c(68:76)
m2b <- mnh_df %>%
  dplyr::select(`Violation Precinct`, `Violation Description`, `Violation Code`) %>%
  mutate(
    Type = case_when(
      `Violation Code` == 1 | `Violation Code` == 4 | `Violation Code` == 5 | `Violation Code` == 12 | `Violation Code` == 18 | `Violation Code` == 19 ~ "BUS related violations",
      `Violation Code` == 67 |`Violation Code`== 50 | `Violation Code` == 51 | `Violation Code` == 48 ~ "Violations against pedestrian traffic",
      `Violation Code` %in% parking_list ~ "Parking violations",
      `Violation Code` %in% sticker_list ~ "Sticker violations",
      TRUE ~ "Other"
    )
  )
other <- m2b %>%
  filter(`Type` == "Other") %>%
  group_by(`Violation Precinct`) %>%
  count() %>%
  rename(`Other Violations` = "n")
pedestrians <- m2b %>%
  filter(`Type` == "Violations against pedestrian traffic") %>%
  group_by(`Violation Precinct`) %>%
  count() %>%
  rename(`Violations vs Pedestrians` = "n")
parking <- m2b %>%
  filter(`Type` == "Parking violations") %>%
  group_by(`Violation Precinct`) %>%
  count() %>%
  rename(`Parking Violations` = "n")
sticker <- m2b %>%
  filter(`Type` == "Sticker violations") %>%
  group_by(`Violation Precinct`) %>%
  count() %>%
  rename(`Sticker Violations` = "n")
mnh_map <- merge(mnh_map, parking, by.x = "Precinct", by.y = "Violation Precinct", all.x = TRUE, duplicateGeoms = TRUE)
Error in h(simpleError(msg, call)) : 
  error in evaluating the argument 'x' in selecting a method for function 'merge': oggetto "mnh_map" non trovato
tm_other <- tm_shape(mnh_map) +
  tm_fill(col = "Other Violations") +
  tm_borders() 
tm_pede <- tm_shape(mnh_map) +
  tm_fill(col = "Violations vs Pedestrians") +
  tm_borders() 
tm_parking <- tm_shape(mnh_map) +
  tm_fill(col = "Parking Violations") +
  tm_borders() 
tm_sticker <- tm_shape(mnh_map) +
  tm_fill(col = "Sticker Violations") +
  tm_borders() 
tmap_arrange(tm_sticker, tm_parking, tm_pede, tm_other)


3. Focus on the Upper East

Precinct 19 identifies the Upper East Side. The data currently does not provide latitude and longitude of the violation locations (and I am not sure what these street_code variables are for).

a) Ignoring fire hydrants

Restrict your data to parking violations related to fire hydrants (Violation Code = 40). Using the variables Street Name and House Number as well as the knowledge that these addresses are in the Upper East Side of Manhattan, geocode at least 500 addresses. Include a data table of these addresses and the latitude and longitude of these addresses in the output.

geoloc_df <- mnh_df %>%
  filter(`Violation Precinct` == 19 & `Violation Code` == 40) %>%
  dplyr::select(`Street Name`, `House Number`, `issue_date`, `Violation Description`, `Fine Amount $`, `Plate ID`, `Vehicle Make`)
geoloc_df$`House Number` <- as.numeric(geoloc_df$`House Number`)
geoloc_df <- drop_na(geoloc_df)

geoloc_df <- geoloc_df %>%
  unite(`Street Name`, `House Number`, col = "Location", sep = ", ", remove = FALSE)

geoloc_df$Location <- paste("United States, NY, New York,", geoloc_df$Location, sep=" ")

head(geoloc_df$Location)

geo_df <- geoloc_df[sample(nrow(geoloc_df), 700), ]
example <- base::sample(x = geo_df$Location, 
                               size = 1, 
                               replace = TRUE)
example
ggmap::geocode(location = example)
geocoded_df <- dplyr::bind_cols(geo_df$Location, GeoCoded) %>% 
  dplyr::select(
    lng = lon,
    lat,
    dplyr::everything())
New names:
* NA -> ...1
rename(geocoded_df, "Address" = `...1`)
datatable(geocoded_df,
          filter = 'top',
          options = list(pageLength = 5, autoWidth = TRUE),
          colnames = c('lng', 'lat', 'Address'),
          )
b) Interactive Map

Provide an interactive map of the violations you geocoded using leaflet. Provide at least three pieces of information on the parking ticket in a popup.

library(leaflet)
geo_df <- dplyr::bind_cols(geo_df, GeoCoded) %>% 
  dplyr::select(
    lng = lon,
    lat,
    dplyr::everything())
# creating a popuptext column
geo_df <- geo_df %>%  
  dplyr::mutate(popuptext = base::paste0("<b>", 
                                 str_to_title(geo_df$`Violation Description`), " Violation",
                                 "</b><br />",
                                 "<i>", "Date: ",
                                 geo_df$issue_date, 
                                 ", ", "Fine Amount: $",
                                 geo_df$`Fine Amount $`,
                                 "</i><br />",
                                 "<i>"))
leaflet(geo_df) %>%
  addTiles() %>%
  addCircles(lng = ~lng, lat = ~lat, color = "orange", popup = ~popuptext) %>%
  setView(lng = -73.956189, lat = 40.774917, zoom = 14) %>%
  addProviderTiles(providers$CartoDB.DarkMatter)
c) Luxury cars and repeat offenders

Using the vehicle Plate ID, identify repeat offenders (in the full dataset). Create another variable called luxury_car in which you identify luxury car brands using the Vehicle Make variable.

Start with the previous map. Distinguish the points by whether the car is a repeat offender and/or luxury car. Add a legend informing the user about the color scheme. Also make sure that the added information about the car type and repeat offender status is now contained in the popup information. Show this map.

gino <- mnh_df %>%
  dplyr::select(`Plate ID`) %>%
  group_by(`Plate ID`) %>%
  add_count(`Plate ID`) %>%
  unique() %>%
  mutate(`Repeat Offender` = ifelse(n > 1, paste("Yes"), paste("No")))
gino %>%
  arrange(desc(n)) %>%
  head(10)

Wow! Besides “BLANKPLATE”, some of these people could have gone broke for all these fines!

sort(unique(mnh_df$`Vehicle Make`))
luxury_cars = c("MASE", "FERRA", "FER", "ASTO", "ROLLS", "MAYBA", "BUGAT", "BENT", "TESLA", "TELSA", "TSLA", "PORSC", "ROLLI", "AUDI", "BMW", "LAMBO", "MERC")
geo_df <- geo_df %>%
  mutate(
    `Car Type` = case_when(
    `Vehicle Make` %in% luxury_cars ~ "Luxury car",
      TRUE ~ "Non-luxury car"
  ))
geo_df <- left_join(geo_df, gino, by = "Plate ID", all.x = TRUE)
library(RColorBrewer)
pal = colorFactor("Set1", domain = geo_df$`Repeat Offender`)
rpof_color = pal(geo_df$`Repeat Offender`)
popup <- paste("<b>", "Plate ID: ", "</b>", geo_df$`Plate ID`, "<br/>",
               "<b>","Car Type: ", "</b>", geo_df$`Car Type`, "<br/>",
               "<b>","Repeat Offender: ", "</b>", geo_df$`Repeat Offender`, "<br/>")
leaflet(geo_df) %>%
  addTiles() %>%
  addCircles(lng = ~lng, lat = ~lat, color = rpof_color, popup = ~popup) %>%
  addLegend(pal = pal, values = ~geo_df$`Repeat Offender`, title = "Repeat Offender") %>%
  setView(lng = -73.956189, lat = 40.774917, zoom = 14) %>%
  addProviderTiles(providers$CartoDB.DarkMatter)
d) Cluster

Add marker clustering, so that zooming in will reveal the individual locations but the zoomed out map only shows the clusters. Show the map with clusters.

leaflet(geo_df) %>%
  addTiles() %>%
  addCircleMarkers(color = rpof_color, popup = popup, clusterOptions = markerClusterOptions()) %>%
  addLegend(pal = pal, values = ~geo_df$`Repeat Offender`, title = "Repeat Offender") %>%
  setView(lng = -73.956189, lat = 40.774917, zoom = 14) %>%
  addProviderTiles(providers$CartoDB.DarkMatter)
Assuming "lng" and "lat" are longitude and latitude, respectively
---
title: 'Assignment 2: Mapping Parking Violations in NYC'
author: "Davide Vaccari"
date: "3/17/2021"
output: 
  html_notebook:
    toc: TRUE
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, error = FALSE, warning = FALSE, message = FALSE)
knitr::opts_knit$set(root.dir = normalizePath("/Users/davidev7/Documents/Columbia/Second Semester/Data Visualization/course_content-main/Exercises/07_parking-graded/data/parking"))
```

```{r}
library(tidyverse)
library(dplyr)
library(ggplot2)
library(rgdal)
library(tmap)
library(readxl)
library(ggrepel)
library(ggthemes)
library(scales)
library(sf)
library(raster)
library(rgeos)
library(ggmap)
library(DT)
library(devtools)
```

```{r}
ggmap::register_google(key = "AIzaSyB1-MjiXEIrgdT3FbflMLc8EUaQXVG3XVY")
```

```{r}
devtools::install_github("rstudio/leaflet")
```

Parking Violations in NYC
================================

```{r, results='hide'}
df <- read_csv("parkingNYC_Jan2020-Jan2021.csv")
```
```{r}
fines <- read_excel("ParkingViolationCodes_January2020.xlsx")
```

##### 1. Data exploration

```{r}
head(df)
colnames(df)
df <- dplyr::select(df, -"Violation Description")
```
```{r}
colnames(fines) <- str_to_title(colnames(fines))
fines <- fines[c(1:3)]
colnames(fines)[3] <- "Fine Amount $"
head(fines)
```

Violation 38 [FAIL TO DSPLY MUNI METER RECPT] and 69 [FAIL TO DISP. MUNI METER RECPT] look like the same infraction. The fine is $65 for both. I'd rename one as the other and aggregate them as the same violation.

```{r}
fines$`Violation Description`[69] <- "FAIL TO DSPLY MUNI METER RECPT"
```

```{r}
df <- merge(x = df, y = fines, by = "Violation Code", all.x = TRUE)
```

##### a) Violation Code and Fine Amounts

Add the violation code descriptions and fine amounts to the data file. Provide a visual overview of the top 10 most common types of violations (feel free to group them into categories if reasonable). Compare how this ranking differs if we focus on the total amount of revenue generated.

**Top 10 Violations**

```{r}
one_a <- df %>%
  dplyr::select(`Violation Description`) %>%
  group_by(`Violation Description`) %>%
  count() %>%
  ungroup() %>%
  arrange(desc(n)) %>%
  head(10)
```

```{r}
legend_ord <- levels(with(one_a, reorder(`Violation Description`, desc(n))))
```

```{r}
one_a %>%
  ggplot(aes(x = reorder(`Violation Description`, desc(n)), y = n, fill = `Violation Description`)) +
  geom_col(alpha = 0.8) +
  scale_fill_brewer(palette = "RdBu", breaks = legend_ord) +
  
  labs(
  title = "Top 10 most common types of violations in NYC",
  x = "",
  y = "Number of Violations",
  x.axis = "",
  caption = "Source: data.cityofnewyork.us"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
   
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_y_continuous(labels = comma)
```

**Top 10 Violations for revenue**

```{r}
one_a_2 <- df %>%
  dplyr::select(`Violation Description`, `Fine Amount $`) %>%
  group_by(`Violation Description`,`Fine Amount $`) %>% 
  summarize(`Total Fines Amount $` = sum(`Fine Amount $`)) %>%
  arrange(desc(`Total Fines Amount $`)) %>%
  head(10)
```

```{r}
legend_ord2 <- levels(with(one_a_2, reorder(`Violation Description`, desc(`Total Fines Amount $`))))
```

```{r}
one_a_2 %>%
  ggplot(aes(x = reorder(`Violation Description`, desc(`Total Fines Amount $`)), y = `Total Fines Amount $`, fill = `Violation Description`)) +
  geom_col(alpha = 0.8) +
  scale_fill_brewer(palette = "RdBu", breaks = legend_ord2) +
  
  labs(
  title = "Top 10 highest violations in NYC per revenue",
  x = "",
  y = "Dollar ($)",
  x.axis = "",
  caption = "Source: data.cityofnewyork.us"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
   
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_y_continuous(labels = comma)
```


##### b) Average amount of fine by vehicle

Compare the average amount of fine by vehicle color, vehicle year, and [vehicle plate type](https://dmv.ny.gov/registration/registration-class-codes) [Hint: it is sufficient to restrict your attention to commercial (`COM`) and passenger (`PAS`) vehicles]? Briefly describe your findings.

**Vehicle Color**

```{r}
one_b_1 <- df %>%
  dplyr::select(`Vehicle Color`, `Fine Amount $`) %>%
  drop_na(`Fine Amount $`) %>%
  group_by(`Vehicle Color`) %>%
  add_count(`Vehicle Color`) %>%
  filter(n > 10) %>%
  summarize(`Average Fine` = mean(`Fine Amount $`)) %>%
  arrange(desc(`Average Fine`)) %>%
  head(10)
```

```{r}
legend_ord3 <- levels(with(one_b_1, reorder(`Vehicle Color`, desc(`Average Fine`))))
```

```{r}
one_b_1 %>%
  ggplot(aes(x = reorder(`Vehicle Color`, desc(`Average Fine`)), y = `Average Fine`, fill = `Vehicle Color`)) +
  geom_col(alpha = 0.8) +
  scale_fill_brewer(palette = "RdBu", breaks = legend_ord3) +
  
  labs(
  title = "Top 10 highest Average Fine per Car Color",
  x = "",
  y = "Dollar ($)",
  x.axis = "",
  caption = "Source: data.cityofnewyork.us"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
   
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_y_continuous(labels = comma)
```


**Vehicle Year**

```{r}
one_b_2 <- df %>%
  dplyr::select(`Vehicle Year`, `Fine Amount $`) %>%
  group_by(`Vehicle Year`) %>%
  summarize(`Average Fine` = mean(`Fine Amount $`)) %>%
  filter(`Vehicle Year` > 0 & `Vehicle Year` <= 2021) %>%
  arrange(desc(`Average Fine`)) %>%
  head(10)
```

```{r}
legend_ord4 <- levels(with(one_b_2, reorder(`Vehicle Year`, desc(`Average Fine`))))
```

```{r}
one_b_2 %>%
  ggplot(aes(x = reorder(`Vehicle Year`, desc(`Average Fine`)), y = `Average Fine`, fill = as.factor(`Vehicle Year`))) +
  geom_col(alpha = 0.8) +
  scale_fill_brewer(palette = "RdBu", breaks = legend_ord3) +
  
  labs(
  title = "Top 10 highest Average Fine per Vehicle Registration Year",
  x = "Registration Year",
  y = "Dollar ($)",
  x.axis = "",
  caption = "Source: data.cityofnewyork.us"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_y_continuous(labels = comma)
```


**Vehicle Plate Type**

```{r}
one_b_3 <- df %>%
  dplyr::select(`Plate Type`, `Fine Amount $`) %>%
  group_by(`Plate Type`) %>%
  filter(`Plate Type` == "COM" | `Plate Type` == "PAS") %>%
  summarize(`Average Fine` = mean(`Fine Amount $`, na.rm=TRUE)) %>%
  arrange(desc(`Average Fine`))
```

```{r}
one_b_3 %>%
  ggplot(aes(x = reorder(`Plate Type`, desc(`Average Fine`)), y = `Average Fine`, fill = `Plate Type`)) +
  geom_col(alpha = 0.8) +
  scale_fill_brewer(palette = "RdYlBu") +
  
  geom_hline(yintercept=89.704) +
  
  labs(
  title = "Average Fine per Car Type",
  x = "",
  y = "Dollar ($)",
  x.axis = "",
  caption = "Source: data.cityofnewyork.us"
  ) +
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect(),
   
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    
    legend.key = element_rect(fill = "white", colour = "white"),
    legend.background = element_rect()
    ) + 
  
  scale_y_continuous(labels = comma, limits = c(0, 100))
```


##### c) Effect of COVID

Let's see if we can observe the effect of COVID restrictions on parking violations. Present a visualization that shows how parking violations changed after the New York statewide stay-at-home order on March 14, 2020. Make sure the visualization clearly highlights the main pattern (the COVID effect).


```{r}
one_c <- df %>%
  filter(`year` == 2020) %>%
  dplyr::select(`month`, `Violation Description`) %>%
  add_count(`Violation Description`) %>%
  group_by(`month`) %>%
  summarize(Mean = mean(n))
```

```{r}
one_c %>%

  ggplot(aes(x = `month`, y = Mean)) +
  
  geom_line() +
  geom_vline(xintercept=3.5) +
  xlim(c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) +
  ylim(50000,200000) +
  
  geom_label(
  label="New York statewide\nstay-at-home order", 
  x=2.2,
  y=145000,
  label.padding = unit(0.55, "lines"),
  label.size = 0.15,
  color = "black",
  fill="lightcyan"
  ) +
  
  labs(
    title = "NY Violations in 2020",
    x = "2020 Months", 
    y = "Average Violations per Month",
    caption = "Source: data.cityofnewyork.us"
    ) + 
  
  
  theme_tufte() + 
  theme(
    plot.title = element_text(size = 13, face = "bold"),
    plot.caption = element_text(hjust = 0, face = "italic"),
    plot.background = element_rect()
    ) 
```

I tried my best (!), but I can't really say there has been a "COVID effect". Violations appear to be lower in the first months of 2020 compared to the summer. Probably snow and hard weather have an harder impact on traffic than Covid restrictions.

___

#### 2. Map by Precincts

Read in the shape files for the police precincts and remove all precincts outside of Manhattan.

```{r}
setwd("/Users/davidev7/Documents/Columbia/Second Semester/Data Visualization/course_content-main/Exercises/07_parking-graded/data")
file.exists('/Users/davidev7/Documents/Columbia/Second Semester/Data Visualization/course_content-main/Exercises/07_parking-graded/data/nypp_21a')

nypp <- readOGR(dsn = '/Users/davidev7/Documents/Columbia/Second Semester/Data Visualization/course_content-main/Exercises/07_parking-graded/data/nypp_21a', layer = "nypp")
```
```{r}
str(nypp,max.level = 2)
```

```{r}
tm_shape(nypp) +
  tm_fill() +
  tm_borders() 
```

I know from [NYC website](https://www1.nyc.gov/site/nypd/bureaus/patrol/precincts-landing.page) that Manhattan precincts are 1 to 34.

```{r}
mnh<- subset(nypp, nypp$Precinct<=34)

tm_shape(mnh) +
  tm_fill() +
  tm_borders() 
```

##### a) Number of tickets, total fines, and average fines

Provide three maps that show choropleth maps of:

  - the total number of tickets 
  - the total amount of fines 
  - the average amount of fines
  
Briefly describe what you learn from these maps in comparison.

```{r}
mnh_df <- df %>%
  filter(`Violation Precinct` > 0 & `Violation Precinct` <= 34 )
```

Now I need to merge the map with the violation's dataset. However, since the resulting object would be too big, I'll make little datasets related to the question and merge those one by one.

```{r}
mnh
mnh@data[["Precinct"]]
sort((unique(mnh_df$`Violation Precinct`)))
```

- the total number of tickets 
 
```{r}
tickets <- mnh_df %>%
  group_by(`Violation Precinct`) %>%
  count() %>%
  rename(`Total Number of Tickets` = "n")
```

```{r}
tickets_map <- merge(mnh, tickets, by.x = "Precinct", by.y = "Violation Precinct", all.x = TRUE, duplicateGeoms = TRUE)
```

```{r}
tm_shape(tickets_map) +
  tm_fill(col = "Total Number of Tickets") +
  tm_borders() 
```

- the total amount of fines

```{r}
total_fines <- mnh_df %>%
  dplyr::select(`Violation Precinct`, `Fine Amount $`) %>%
  group_by(`Violation Precinct`) %>% 
  summarize(`Total Fines Amount $` = sum(`Fine Amount $`, na.rm = TRUE))
```

```{r}
fines_map <- tickets_map <- merge(mnh, total_fines, by.x = "Precinct", by.y = "Violation Precinct", all.x = TRUE, duplicateGeoms = TRUE)
```

```{r}
tm_shape(fines_map) +
  tm_fill(col = "Total Fines Amount $") +
  tm_borders() 
```

- the average amount of fines

```{r}
average_fines <- mnh_df %>%
  dplyr::select(`Violation Precinct`, `Fine Amount $`) %>%
  group_by(`Violation Precinct`) %>% 
  summarize(`Mean Fines Amount $` = mean(`Fine Amount $`, na.rm = TRUE))
```

```{r}
mnh_avg_fines <- merge(mnh, average_fines, by.x = "Precinct", by.y = "Violation Precinct", all.x = TRUE, duplicateGeoms = TRUE)
```

```{r}
tm_shape(mnh_avg_fines) +
  tm_fill(col = "Mean Fines Amount $") +
  tm_borders() 
```

##### b) Types of violations

Group the almost 100 types of ticket violations into a smaller set of 4-6 subgroups (where `other` should be the remainder of violations not included in other groups you defined). [Hint: No need to spend more than 5 minutes thinking about what the right grouping is.]. Provide choropleth maps for each of these subgroups to show where different types of violations are more or less common. 

```{r}
parking_list = c(9:11, 13:28, 30:33, 37:39, 44, 46, 47, 55, 59, 60, 62:65, 77)
sticker_list = c(68:76)
```

```{r}
m2b <- mnh_df %>%
  dplyr::select(`Violation Precinct`, `Violation Description`, `Violation Code`) %>%
  mutate(
    Type = case_when(
      `Violation Code` == 1 | `Violation Code` == 4 | `Violation Code` == 5 | `Violation Code` == 12 | `Violation Code` == 18 | `Violation Code` == 19 ~ "BUS related violations",
      `Violation Code` == 67 |`Violation Code`== 50 | `Violation Code` == 51 | `Violation Code` == 48 ~ "Violations against pedestrian traffic",
      `Violation Code` %in% parking_list ~ "Parking violations",
      `Violation Code` %in% sticker_list ~ "Sticker violations",
      TRUE ~ "Other"
    )
  )
```


```{r}
other <- m2b %>%
  filter(`Type` == "Other") %>%
  group_by(`Violation Precinct`) %>%
  count() %>%
  rename(`Other Violations` = "n")
```

```{r}
pedestrians <- m2b %>%
  filter(`Type` == "Violations against pedestrian traffic") %>%
  group_by(`Violation Precinct`) %>%
  count() %>%
  rename(`Violations vs Pedestrians` = "n")
```

```{r}
parking <- m2b %>%
  filter(`Type` == "Parking violations") %>%
  group_by(`Violation Precinct`) %>%
  count() %>%
  rename(`Parking Violations` = "n")
```

```{r}
sticker <- m2b %>%
  filter(`Type` == "Sticker violations") %>%
  group_by(`Violation Precinct`) %>%
  count() %>%
  rename(`Sticker Violations` = "n")
```

```{r}
mnh_map <- merge(mnh, other, by.x = "Precinct", by.y = "Violation Precinct", all.x = TRUE, duplicateGeoms = TRUE)
mnh_map <- merge(mnh_map, pedestrians, by.x = "Precinct", by.y = "Violation Precinct", all.x = TRUE, duplicateGeoms = TRUE)
mnh_map <- merge(mnh_map, parking, by.x = "Precinct", by.y = "Violation Precinct", all.x = TRUE, duplicateGeoms = TRUE)
mnh_map <- merge(mnh_map, sticker, by.x = "Precinct", by.y = "Violation Precinct", all.x = TRUE, duplicateGeoms = TRUE)
```

```{r}
tm_other <- tm_shape(mnh_map) +
  tm_fill(col = "Other Violations") +
  tm_borders() 
```

```{r}
tm_pede <- tm_shape(mnh_map) +
  tm_fill(col = "Violations vs Pedestrians") +
  tm_borders() 
```

```{r}
tm_parking <- tm_shape(mnh_map) +
  tm_fill(col = "Parking Violations") +
  tm_borders() 
```

```{r}
tm_sticker <- tm_shape(mnh_map) +
  tm_fill(col = "Sticker Violations") +
  tm_borders() 
```

```{r}
tmap_arrange(tm_sticker, tm_parking, tm_pede, tm_other)
```

___
#### 3. Focus on the Upper East

[Precinct 19](https://www1.nyc.gov/site/nypd/bureaus/patrol/precincts/19th-precinct.page) identifies the Upper East Side. The data currently does not provide latitude and longitude of the violation locations (and I am not sure what these `street_code` variables are for).

##### a) Ignoring fire hydrants

Restrict your data to parking violations related to fire hydrants (`Violation Code = 40`). Using the variables `Street Name` and `House Number` as well as the knowledge that these addresses are in the Upper East Side of Manhattan, geocode at least 500 addresses. Include a data table of these addresses and the latitude and longitude of these addresses in the output.

```{r}
geoloc_df <- mnh_df %>%
  filter(`Violation Precinct` == 19 & `Violation Code` == 40) %>%
  dplyr::select(`Street Name`, `House Number`, `issue_date`, `Violation Description`, `Fine Amount $`, `Plate ID`, `Vehicle Make`)
```

```{r}
geoloc_df$`House Number` <- as.numeric(geoloc_df$`House Number`)
geoloc_df <- drop_na(geoloc_df)

geoloc_df <- geoloc_df %>%
  unite(`Street Name`, `House Number`, col = "Location", sep = ", ", remove = FALSE)

geoloc_df$Location <- paste("United States, NY, New York,", geoloc_df$Location, sep=" ")

head(geoloc_df$Location)

geo_df <- geoloc_df[sample(nrow(geoloc_df), 700), ]
```

```{r}
example <- base::sample(x = geo_df$Location, 
                               size = 1, 
                               replace = TRUE)
example
ggmap::geocode(location = example)
```

```{r, results='hide'}
GeoCoded <- purrr::map_df(.x = geo_df$Location, .f = ggmap::geocode)
```

```{r}
geocoded_df <- dplyr::bind_cols(geo_df$Location, GeoCoded) %>% 
  dplyr::select(
    lng = lon,
    lat,
    dplyr::everything())

rename(geocoded_df, "Address" = `...1`)
```

```{r}
datatable(geocoded_df,
          filter = 'top',
          options = list(pageLength = 5, autoWidth = TRUE),
          colnames = c('lng', 'lat', 'Address'),
          )
```

##### b) Interactive Map

Provide an interactive map of the violations you geocoded using `leaflet`. Provide at least three pieces of information on the parking ticket in a popup.

```{r}
library(leaflet)
```

```{r}
geo_df <- dplyr::bind_cols(geo_df, GeoCoded) %>% 
  dplyr::select(
    lng = lon,
    lat,
    dplyr::everything())
```


```{r}
# creating a popuptext column
geo_df <- geo_df %>%  
  dplyr::mutate(popuptext = base::paste0("<b>", 
                                 str_to_title(geo_df$`Violation Description`), " Violation",
                                 "</b><br />",
                                 "<i>", "Date: ",
                                 geo_df$issue_date, 
                                 ", ", "Fine Amount: $",
                                 geo_df$`Fine Amount $`,
                                 "</i><br />",
                                 "<i>"))
```


```{r}
leaflet(geo_df) %>%
  addTiles() %>%
  addCircles(lng = ~lng, lat = ~lat, color = "orange", popup = ~popuptext) %>%
  setView(lng = -73.956189, lat = 40.774917, zoom = 14) %>%
  addProviderTiles(providers$CartoDB.DarkMatter)
```

##### c) Luxury cars and repeat offenders

Using the vehicle `Plate ID`, identify repeat offenders (in the full dataset). Create another variable called `luxury_car` in which you identify luxury car brands using the `Vehicle Make` variable.

Start with the previous map. Distinguish the points by whether the car is a repeat offender and/or luxury car. Add a legend informing the user about the color scheme. Also make sure that the added information about the car type and repeat offender status is now contained in the popup information. Show this map.

```{r}
gino <- mnh_df %>%
  dplyr::select(`Plate ID`) %>%
  group_by(`Plate ID`) %>%
  add_count(`Plate ID`) %>%
  unique() %>%
  mutate(`Repeat Offender` = ifelse(n > 1, paste("Yes"), paste("No")))
```

```{r}
gino %>%
  arrange(desc(n)) %>%
  head(10)
```

Wow! Besides "BLANKPLATE", some of these people could have gone broke for all these fines!

```{r}
sort(unique(mnh_df$`Vehicle Make`))
```
```{r}
luxury_cars = c("MASE", "FERRA", "FER", "ASTO", "ROLLS", "MAYBA", "BUGAT", "BENT", "TESLA", "TELSA", "TSLA", "PORSC", "ROLLI", "AUDI", "BMW", "LAMBO", "MERC")
```

```{r}
geo_df <- geo_df %>%
  mutate(
    `Car Type` = case_when(
    `Vehicle Make` %in% luxury_cars ~ "Luxury car",
      TRUE ~ "Non-luxury car"
  ))
```

```{r}
geo_df <- left_join(geo_df, gino, by = "Plate ID", all.x = TRUE)
```

```{r}
library(RColorBrewer)
```

```{r}
pal = colorFactor("Set1", domain = geo_df$`Repeat Offender`)
rpof_color = pal(geo_df$`Repeat Offender`)
```

```{r}
popup <- paste("<b>", "Plate ID: ", "</b>", geo_df$`Plate ID`, "<br/>",
               "<b>","Car Type: ", "</b>", geo_df$`Car Type`, "<br/>",
               "<b>","Repeat Offender: ", "</b>", geo_df$`Repeat Offender`, "<br/>")
```

```{r}
leaflet(geo_df) %>%
  addTiles() %>%
  addCircles(lng = ~lng, lat = ~lat, color = rpof_color, popup = ~popup) %>%
  addLegend(pal = pal, values = ~geo_df$`Repeat Offender`, title = "Repeat Offender") %>%
  setView(lng = -73.956189, lat = 40.774917, zoom = 14) %>%
  addProviderTiles(providers$CartoDB.DarkMatter)
```

##### d) Cluster

Add marker clustering, so that zooming in will reveal the individual locations but the zoomed out map only shows the clusters. Show the map with clusters.

```{r}
leaflet(geo_df) %>%
  addTiles() %>%
  addCircleMarkers(color = rpof_color, popup = popup, clusterOptions = markerClusterOptions()) %>%
  addLegend(pal = pal, values = ~geo_df$`Repeat Offender`, title = "Repeat Offender") %>%
  setView(lng = -73.956189, lat = 40.774917, zoom = 14) %>%
  addProviderTiles(providers$CartoDB.DarkMatter)
```